from collections import Counter
from config import conf
import pandas as pd


def ana_data(path):
    """Print a quick exploratory summary of a tab-separated text/label file.

    The file is read without a header row and its two columns are named
    ``text`` and ``label``. Three sections are printed to stdout: the
    DataFrame overview (``info()`` and ``head()``), the label frequency
    counts, and descriptive statistics of the per-row text length.

    Rows whose text field is missing (parsed by pandas as NaN) get a NaN
    length and are therefore left out of the length statistics instead of
    aborting the analysis.

    Args:
        path: Location of the TSV file, passed straight to ``pd.read_csv``.

    Returns:
        None. All results are printed.
    """
    # Parse ``text`` as strings so sentences made only of digits are
    # measured in characters rather than being converted to numbers.
    data = pd.read_csv(
        path,
        sep='\t',
        names=['text', 'label'],
        dtype={'text': str},
    )
    print('数据概况')
    data.info()
    print(data.head())

    print('标签分布')
    print(Counter(data['label']))

    print('句子长度分布')
    # ``.str.len()`` is NaN-aware, unlike applying the builtin ``len``,
    # which raises TypeError on the float NaN of an empty text field.
    data['text_len'] = data['text'].str.len()
    print(data['text_len'].describe())
    # NOTE(review): the original author recorded "mean + 3 * std = 30" here,
    # presumably an observation from one of the analysed files - confirm
    # which one before relying on it.


if __name__ == '__main__':
    # Analyse each configured file in turn: train, then test, then dev.
    # Attributes are looked up one at a time, right before each call.
    for path_attr in ('train_path', 'test_path', 'dev_path'):
        ana_data(getattr(conf, path_attr))
